Libraries
library(rtracklayer)
library(GenomicFeatures)
library(txdbmaker)
library(Biostrings)
library(BSgenome)
library(BSgenomeForge)
library(dplyr)
library(GenomicRanges)
library(GenomeInfoDb)
library(Biostrings)
library(BSgenome)
library(ggplot2)
library(scales)
library(ggpointdensity)
Loading data - Loading seqs + filtering
library(BSgenome.Dlaeve.NCBI.dlgm)
genome <- BSgenome.Dlaeve.NCBI.dlgm
print(class(genome))
length(genome)
genome
# names present in your BSgenome
nm <- seqnames(genome)
# choose only proper chromosomes (e.g. chr1..chr31), ignore the HiC_scaffold zoo
keep <- intersect(paste0("chr", 1:31), nm) # safe even if some are missing
# grab them quickly as a DNAStringSet (lightweight, fast)
genome_chr <- getSeq(genome, keep)
# sanity check
length(genome_chr); head(names(genome_chr))
transposons <- import("C:/Users/rafae/OneDrive/Desktop/Genomics/Data2/Data3/Data4/HiTE_intact.CORRECTED.gff3")
transposons
… [30 líneas truncadas]
# names you want, taken from your FASTA grab (chr1..chr31)
wanted <- seqlevels(genome_chr)
# if styles differ, normalize once (e.g., add/remove "chr")
seqlevelsStyle(transposons) <- seqlevelsStyle(genome_chr)
# keep only those seqlevels and prune ranges on others
transposons_chr <- keepSeqlevels(transposons, wanted, pruning.mode = "coarse")
# order levels identically to genome, then copy seqinfo
seqlevels(transposons_chr) <- wanted
seqinfo(transposons_chr) <- seqinfo(genome_chr)[wanted]
# sanity:
seqlevels(transposons_chr) # should be chr1..chr31 only
geneinfo <- mcols(transposons_chr)[c("id","source")]
geneinfo
seqinfo(transposons_chr) <- seqinfo(genome_chr)
transposons_chr <- trim(transposons_chr)
transposons_seqs <- getSeq(genome_chr, transposons_chr)
Body
length(transposons_seqs)
transposons_seqs
Counting
letterFrequency_all <- letterFrequency(transposons_seqs, c("A", "T", "C", "G")) %>% as.data.frame %>% tibble()
letterFrequency_all
C_all <- letterFrequency_all[3]
G_all <- letterFrequency_all[4]
Width_all <- width(transposons_seqs)
CG_all <- dinucleotideFrequency(transposons_seqs)[,"CG"]
OE_all = (CG_all*Width_all)/(C_all*G_all)
colnames(OE_all) <- c("OEindex")
OE_all
cgContent_all = (C_all+G_all)/Width_all
colnames(cgContent_all) <- c("CGpercentage")
cgContent_all
df <- cbind(OE_all,cgContent_all)
df
Graphs
x <- OE_all$OEindex
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10
ggplot(OE_all, aes(x = OEindex)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = bw, boundary = 0, closed = "left",
fill = "#4C7BA8", color = "white", alpha = 0.95) +
geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
scale_x_continuous(
breaks = seq(xmin, xmax, by = 0.1),
minor_breaks = seq(xmin, xmax, by = 0.05),
labels = number_format(accuracy = 0.1),
expand = expansion(mult = c(0.01, 0.02))
) +
labs(
title = "O/E CpG Ratio Distribution — Intact transposons (Body)",
x = "O/E index",
y = "Density"
) +
theme_minimal(base_size = 13) +
theme(
panel.grid.minor = element_line(),
plot.title = element_text(face = "bold")
)
x <- cgContent_all$CGpercentage
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10
ggplot(cgContent_all, aes(x = CGpercentage)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = bw, boundary = 0, closed = "left",
fill = "#4C7BA8", color = "white", alpha = 0.95) +
geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
scale_x_continuous(
breaks = seq(xmin, xmax, by = 0.1),
minor_breaks = seq(xmin, xmax, by = 0.05),
labels = number_format(accuracy = 0.1),
expand = expansion(mult = c(0.01, 0.02))
) +
labs(
title = "CG% — Intact transposons (Body)",
x = "CG%",
y = "Density"
) +
theme_minimal(base_size = 13) +
theme(
panel.grid.minor = element_line(),
plot.title = element_text(face = "bold")
)
plot_nice_dark <- function(d, x="OEindex", y="CGpercentage",
xthr=.65, ythr=.55, title="Scatter - Intact transposons - Body",
point_col="#10b981", thr_col="#7f1d1d"){ # green, violet
dd <- data.frame(x=as.numeric(d[[x]]), y=as.numeric(d[[y]]))
dd <- dd[is.finite(dd$x) & is.finite(dd$y), ]
ggplot(dd, aes(x, y)) +
geom_point(color=point_col, alpha=.75, size=1.6,
position=position_jitter(width=.004, height=0)) +
geom_vline(xintercept=xthr, color=thr_col, linewidth=.6, linetype="dotted") +
geom_hline(yintercept=ythr, color=thr_col, linewidth=.6, linetype="dotted") +
labs(title=title, x="O/E index", y="CG (%)") +
theme_minimal(base_size=12) +
theme(
plot.background = element_rect(fill="black", color=NA),
panel.background = element_rect(fill="black", color=NA),
panel.grid = element_line(color="#242424"),
axis.text = element_text(color="white"),
axis.title = element_text(color="white"),
plot.title = element_text(color="white", face="bold")
)
}
# use:
plot_nice_dark(df)
PCA
# 1) Clean: keep numeric cols, drop NA/Inf rows, drop zero-variance cols
X <- df[, sapply(df, is.numeric), drop = FALSE]
X <- X[Reduce(`&`, lapply(X, is.finite)), , drop = FALSE]
X <- X[, sapply(X, function(z) sd(z) > 0), drop = FALSE]
# 2) PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pc <- as.data.frame(pca$x[, 1:2, drop = FALSE])
ggplot(pc, aes(PC1, PC2)) +
geom_point(alpha = .7, size = 1.6, color = "#10b981") +
theme_minimal(base_size = 13) +
labs(title = "PCA of body", x = "PC1", y = "PC2")
summary(pca)
cor(df$OEindex, df$CGpercentage)
K-Means
# clean once; keep the mask so we can put labels back correctly
X <- df[, c("OEindex","CGpercentage")]
m <- is.finite(X$OEindex) & is.finite(X$CGpercentage)
Xc <- scale(X[m, , drop=FALSE])
set.seed(42)
k <- 7
km <- kmeans(Xc, centers=k, nstart=50)
df$cluster <- NA_integer_
df$cluster[m] <- km$cluster
df$cluster <- factor(df$cluster)
ggplot(df[!is.na(df$cluster),], aes(OEindex, CGpercentage, color=cluster)) +
geom_point(alpha=.75, size=1.7) +
theme_minimal(base_size=13) +
labs(title=sprintf("k-means (k=%d) Body", k),
x="O/E index", y="CG (%)")
#Add names
df <- cbind(geneinfo, df)
df
#Body list with cgi arquitecture
df_filtered_cgi_body <- dplyr::filter(as.data.frame(df), OEindex >= 0.65, CGpercentage >= .55)
df_filtered_cgi_body
C_all_sum <- sum(C_all)
G_all_sum <- sum(G_all)
Width_all_sum <- sum(width(transposons_seqs))
CG_all_sum <- sum(CG_all)
C_all_sum <- as.numeric(C_all_sum)
G_all_sum <- as.numeric(G_all_sum)
Width_all_sum <- as.numeric(Width_all_sum)
CG_all_sum <- as.numeric(CG_all_sum)
#CG% sum - Body
cgContent_all_sum <- (C_all_sum+G_all_sum)/Width_all_sum
cgContent_all_sum
#OE Index sum - Body
OE_all_sum <- (CG_all_sum*Width_all_sum)/(C_all_sum*G_all_sum)
OE_all_sum
#Body sums totals
C_all_sum
G_all_sum
Width_all_sum
CG_all_sum
cgContent_all_sum
OE_all_sum
#Porcentaje/Ratio de GCs en el cuerpo segun el span del cuerpo
CG_all_sum/Width_all_sum
#Porcentaje/Ratio de GC del cuerpo en comparacion al genoma
genomeCG_totalcount = 45535149
CG_all_sum/45535149
PROMOTORES
promotores_500bp <- promoters(transposons_chr, upstream = 2000, downstream = 0)
seqlevelsStyle(promotores_500bp) <- seqlevelsStyle(genome)
promotores_500bp <- keepSeqlevels(
promotores_500bp,
intersect(seqlevels(promotores_500bp), seqlevels(genome)),
pruning.mode = "coarse"
)
seqinfo(promotores_500bp) <- seqinfo(genome)[seqlevels(promotores_500bp)]
promotores_500bp <- trim(promotores_500bp)
promotores_500bp <- promotores_500bp[width(promotores_500bp) > 0]
starts <- start(promotores_500bp)
ends <- end(promotores_500bp)
lens <- seqlengths(promotores_500bp)[as.character(seqnames(promotores_500bp))]
start(promotores_500bp) <- pmax(1L, starts)
end(promotores_500bp) <- pmin(ends, lens)
seqs_promotores_500bp <- getSeq(genome_chr, promotores_500bp)
length(seqs_promotores_500bp)
seqs_promotores_500bp
Counting
letterFrequency_all_promoters <- letterFrequency(seqs_promotores_500bp, c("A", "T", "C", "G")) %>% as.data.frame %>% tibble()
letterFrequency_all_promoters
C_all_promoters <- letterFrequency_all_promoters[3]
G_all_promoters <- letterFrequency_all_promoters[4]
Width_all_promoters <- width(seqs_promotores_500bp)
CG_all_promoters <- dinucleotideFrequency(seqs_promotores_500bp)[,"CG"]
C_all_promoters
G_all_promoters
Width_all_promoters
… [6226 líneas truncadas]
CG_all_promoters
… [4976 líneas truncadas]
OE_all_promoters = (CG_all_promoters*Width_all_promoters)/(C_all_promoters*G_all_promoters)
colnames(OE_all_promoters) <- c("OEindex")
OE_all_promoters
cgContent_all_promoters = (C_all_promoters+G_all_promoters)/Width_all_promoters
colnames(cgContent_all_promoters) <- c("CGpercentage")
cgContent_all_promoters
df_promoters <- cbind(OE_all_promoters,cgContent_all_promoters)
df_promoters
Graphs
x <- OE_all_promoters$OEindex
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10
ggplot(OE_all_promoters, aes(x = OEindex)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = bw, boundary = 0, closed = "left",
fill = "#4C7BA8", color = "white", alpha = 0.95) +
geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
scale_x_continuous(
breaks = seq(xmin, xmax, by = 0.1),
minor_breaks = seq(xmin, xmax, by = 0.05),
labels = number_format(accuracy = 0.1),
expand = expansion(mult = c(0.01, 0.02))
) +
labs(
title = "O/E CpG Ratio Distribution — Intact transposons (Promoters)",
x = "O/E index",
y = "Density"
) +
theme_minimal(base_size = 13) +
theme(
panel.grid.minor = element_line(),
plot.title = element_text(face = "bold")
)
x <- cgContent_all_promoters$CGpercentage
x <- x[is.finite(x)]
bw <- 2 * IQR(x) / length(x)^(1/3)
if (!is.finite(bw) || bw <= 0) bw <- diff(range(x)) / 80
xmin <- floor(min(x) * 10) / 10
xmax <- ceiling(max(x) * 10) / 10
ggplot(cgContent_all_promoters, aes(x = CGpercentage)) +
geom_histogram(aes(y = after_stat(density)),
binwidth = bw, boundary = 0, closed = "left",
fill = "#4C7BA8", color = "white", alpha = 0.95) +
geom_density(linewidth = 0.9, color = "black", alpha = 0.6) +
scale_x_continuous(
breaks = seq(xmin, xmax, by = 0.1),
minor_breaks = seq(xmin, xmax, by = 0.05),
labels = number_format(accuracy = 0.1),
expand = expansion(mult = c(0.01, 0.02))
) +
labs(
title = "CG% — Intact transposons (Promoters)",
x = "CG%",
y = "Density"
) +
theme_minimal(base_size = 13) +
theme(
panel.grid.minor = element_line(),
plot.title = element_text(face = "bold")
)
plot_nice_dark <- function(d, x="OEindex", y="CGpercentage",
xthr=.65, ythr=.55, title="Scatter - Intact transposons - Promoters",
point_col="#10b981", thr_col="#7f1d1d"){ # green, violet
dd <- data.frame(x=as.numeric(d[[x]]), y=as.numeric(d[[y]]))
dd <- dd[is.finite(dd$x) & is.finite(dd$y), ]
ggplot(dd, aes(x, y)) +
geom_point(color=point_col, alpha=.75, size=1.6,
position=position_jitter(width=.004, height=0)) +
geom_vline(xintercept=xthr, color=thr_col, linewidth=.6, linetype="dotted") +
geom_hline(yintercept=ythr, color=thr_col, linewidth=.6, linetype="dotted") +
labs(title=title, x="O/E index", y="CG (%)") +
theme_minimal(base_size=12) +
theme(
plot.background = element_rect(fill="black", color=NA),
panel.background = element_rect(fill="black", color=NA),
panel.grid = element_line(color="#242424"),
axis.text = element_text(color="white"),
axis.title = element_text(color="white"),
plot.title = element_text(color="white", face="bold")
)
}
# use:
plot_nice_dark(df_promoters)
PCA
# 1) Clean: keep numeric cols, drop NA/Inf rows, drop zero-variance cols
X <- df_promoters[, sapply(df_promoters, is.numeric), drop = FALSE]
X <- X[Reduce(`&`, lapply(X, is.finite)), , drop = FALSE]
X <- X[, sapply(X, function(z) sd(z) > 0), drop = FALSE]
# 2) PCA
pca <- prcomp(X, center = TRUE, scale. = TRUE)
pc <- as.data.frame(pca$x[, 1:2, drop = FALSE])
ggplot(pc, aes(PC1, PC2)) +
geom_point(alpha = .7, size = 1.6, color = "#10b981") +
theme_minimal(base_size = 13) +
labs(title = "PCA Promoters", x = "PC1", y = "PC2")
summary(pca)
cor(df$OEindex, df$CGpercentage)
K-Means
# clean once; keep the mask so we can put labels back correctly
X <- df_promoters[, c("OEindex","CGpercentage")]
m <- is.finite(X$OEindex) & is.finite(X$CGpercentage)
Xc <- scale(X[m, , drop=FALSE])
set.seed(42)
k <- 2
km <- kmeans(Xc, centers=k, nstart=50)
df$cluster <- NA_integer_
df$cluster[m] <- km$cluster
df$cluster <- factor(df$cluster)
ggplot(df[!is.na(df$cluster),], aes(OEindex, CGpercentage, color=cluster)) +
geom_point(alpha=.75, size=1.7) +
theme_minimal(base_size=13) +
labs(title=sprintf("k-means (k=%d) Promoters", k),
x="O/E index", y="CG (%)")
#Add names
df_promoters <- cbind(geneinfo, df_promoters)
df_promoters
#Promoters list with cgi arquitecture
df_filtered_cgi_promoters <- dplyr::filter(as.data.frame(df_promoters), OEindex >= 0.65, CGpercentage >= .55)
df_filtered_cgi_promoters
Promoters Sum Count
C_all_sum_promoters <- sum(C_all_promoters)
G_all_sum_promoters <- sum(G_all_promoters)
Width_all_sum_promoters <- sum(width(seqs_promotores_500bp))
CG_all_sum_promoters <- sum(CG_all_promoters)
C_all_sum_promoters <- as.numeric(C_all_sum_promoters)
G_all_sum_promoters <- as.numeric(G_all_sum_promoters)
Width_all_sum_promoters <- as.numeric(Width_all_sum_promoters)
CG_all_sum_promoters <- as.numeric(CG_all_sum_promoters)
C_all_sum_promoters
G_all_sum_promoters
Width_all_sum_promoters
CG_all_sum_promoters
#CG% sum - Body
cgContent_all_sum_promoters <- (C_all_sum_promoters+G_all_sum_promoters)/Width_all_sum_promoters
cgContent_all_sum_promoters
#OE Index sum - Body
OE_all_sum_promoters <- (CG_all_sum_promoters*Width_all_sum_promoters)/(C_all_sum_promoters*G_all_sum_promoters)
OE_all_sum_promoters
#Body sums totals
C_all_sum_promoters
G_all_sum_promoters
Width_all_sum_promoters
CG_all_sum_promoters
cgContent_all_sum_promoters
OE_all_sum_promoters
#Porcentaje/Ratio de GCs en el cuerpo segun el span del promoters
CG_all_sum_promoters/Width_all_sum_promoters
#Porcentaje/Ratio de GC del cuerpo en comparacion al genoma
genomeCG_totalcount = 45535149
CG_all_sum_promoters/45535149